In [1]:
#!pip install -r requirements.txt
In [2]:
%load_ext autoreload
%autoreload 2

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

import warnings
warnings.filterwarnings('ignore')

import os
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

pd.options.plotting.backend = "plotly"
import seaborn as sns
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = (20,10)
import plotly_express as px
import plotly.figure_factory as ff
from IPython.display import Image

from src.wk_means import WKMeans
from src.clustering_utils import *

In this notebook, the goal is to apply the Wasserstein k-means market regime clustering on the sp500 data.

0. Load index data

In [3]:
index_data = pd.read_csv('data/SPY.csv', index_col=0, parse_dates=True)['Adj Close'].rename('SPY')
In [4]:
fig = index_data.plot()
fig.update_layout(xaxis_title='date',
                  yaxis_title = 'raw',)
fig.show()

fig = index_data.apply(np.log).diff().plot()
fig.update_layout(xaxis_title='date',
                  yaxis_title='log returns',)
fig.show()

Next, we generate our groun truth data that will serve as benchmark.

1. WK means

A bit of theory :

Each observation represents a distribution. An emprical measure of a distributin is defined as : $$ \mu^{r}((-\infty, x])=\frac{1}{N} \sum_{i=1}^{N} \chi_{\left\{Q^{i}(r) \leq x\right)}(x), $$

where $\chi: \mathbb{R} \rightarrow[0,1]$ is the indicator function and $Q^{j}: \mathcal{S}(\mathbb{R}) \rightarrow \mathbb{R}$ is j_th order stat for j in 1..N.


The Wasserstein distance between two empirical measures is defined as :

$$ \mathcal{W}_{p}(\mu, \nu):=\min _{\mathbb{P} \in \Pi(\mu, \nu)}\left\{\int_{X \times X} d(x, y)^{p} \mathbb{P}(d x, d y)\right\}, $$

where $\Pi(\mu, \nu):=\{\mathbb{P} \in \mathcal{P}(X \times X): \mathbb{P}(A \times X)=\mu(A), \quad \mathbb{P}(X \times B)=\nu(B)\} $ is the set of transport plans between $\mu$ and $\nu$.

and $$ d(x,y) = ||x-y||_X $$

You can see it as the minimum "cost" of turning one distribution into the other.


However, we can compute the W distance between two emprical measures using this simplification :

$$ \mu((-\infty, x])=\frac{1}{N} \sum_{i=1}^{N} \chi_{\alpha_{i} \leq x}(x), \quad \nu((-\infty, x])=\frac{1}{N} \sum_{i=1}^{N} \chi_{\beta_{i} \leq x}(x) $$

where $\left(\alpha_{i}\right)_{1 \leq i \leq N}$ and $\left(\beta_{i}\right)_{1 \leq i \leq N}$ are increasing sequences corresponding to the atoms of $\mu$ and $\nu$. (cf $\frac{1}{N} \sum_{i=1}^{N} \chi_{\left\{Q^{i}(r) \leq x\right)}(x)$)

$$ \begin{aligned} \mathcal{W}_{p}(\mu, \nu)^{p} &=\frac{1}{N} \sum_{i=1}^{N}\left|\alpha_{i}-\beta_{i}\right|^{p} \end{aligned} $$

When p=1, the Wassertstein distance is equivalent to the Earth Mover's Distance.

In [5]:
Image(filename='src/EMD.jpg')
Out[5]:
In [6]:
h1 = 20 #stride # 20 days
h2 = 1 #step

stream = index_data.apply(np.log).diff().ffill().dropna()
features_wk_means, lift = partition(stream, h1, h2, return_lift=True)
In [7]:
features_wk_means
Out[7]:
alpha_1 alpha_2 alpha_3 alpha_4 alpha_5 alpha_6 alpha_7 alpha_8 alpha_9 alpha_10 alpha_11 alpha_12 alpha_13 alpha_14 alpha_15 alpha_16 alpha_17 alpha_18 alpha_19 alpha_20
Date
2000-02-09 -0.031691 -0.028752 -0.015425 -0.012038 -0.009998 -0.007957 -0.007899 -0.004155 -0.004003 -0.002161 -0.001536 0.000887 0.008112 0.009804 0.011292 0.013452 0.013486 0.013517 0.014952 0.026777
2000-02-10 -0.031691 -0.028752 -0.021229 -0.015425 -0.009998 -0.007957 -0.007899 -0.004155 -0.004003 -0.002161 -0.001536 0.000887 0.008112 0.009804 0.011292 0.013452 0.013486 0.013517 0.014952 0.026777
2000-02-11 -0.031691 -0.028752 -0.021229 -0.015425 -0.007957 -0.007899 -0.004155 -0.004003 -0.002161 -0.001536 0.000887 0.001988 0.008112 0.009804 0.011292 0.013452 0.013486 0.013517 0.014952 0.026777
2000-02-14 -0.031691 -0.028752 -0.021229 -0.020518 -0.015425 -0.007957 -0.007899 -0.004155 -0.004003 -0.002161 -0.001536 0.000887 0.001988 0.008112 0.009804 0.011292 0.013486 0.013517 0.014952 0.026777
2000-02-15 -0.031691 -0.028752 -0.021229 -0.020518 -0.015425 -0.007957 -0.007899 -0.004155 -0.004003 -0.002161 -0.001536 0.000887 0.001988 0.005842 0.008112 0.009804 0.011292 0.013517 0.014952 0.026777
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2020-12-24 -0.009009 -0.004487 -0.004437 -0.004001 -0.003582 -0.002057 -0.001687 -0.001543 -0.001173 -0.000327 -0.000273 0.000898 0.001568 0.002102 0.002781 0.002922 0.005576 0.008581 0.010878 0.013429
2020-12-28 -0.009009 -0.004487 -0.004437 -0.004001 -0.003582 -0.002057 -0.001687 -0.001173 -0.000327 -0.000273 0.000898 0.001568 0.002102 0.002781 0.002922 0.003883 0.005576 0.008581 0.010878 0.013429
2020-12-29 -0.009009 -0.004487 -0.004437 -0.004001 -0.003582 -0.002057 -0.001687 -0.001173 -0.000327 -0.000273 0.000898 0.001568 0.002102 0.002922 0.003883 0.005576 0.008554 0.008581 0.010878 0.013429
2020-12-30 -0.009009 -0.004487 -0.004001 -0.003582 -0.002057 -0.001910 -0.001687 -0.001173 -0.000327 -0.000273 0.000898 0.001568 0.002102 0.002922 0.003883 0.005576 0.008554 0.008581 0.010878 0.013429
2020-12-31 -0.009009 -0.004487 -0.004001 -0.003582 -0.002057 -0.001910 -0.001687 -0.001173 -0.000327 -0.000273 0.000898 0.001426 0.001568 0.002102 0.002922 0.003883 0.005576 0.008554 0.008581 0.013429

5258 rows × 20 columns

Now we will initialize and fit a WK means model :

In [8]:
estimator_wk_means = WKMeans(n_clusters=2, random_state=2022)
estimator_wk_means.fit(features_wk_means)
Out[8]:
WKMeans(random_state=2022)
In [9]:
labels_wk_means = create_labels(estimator_wk_means, features_wk_means)
In [10]:
fig = plot_regime_time_series(labels_wk_means, index_data, return_fig=True)
fig.show()

To have another view on the clustering, we plot each set of returns in the mean/vol space :

In [11]:
# Here, the centroids are calculated as the Wasserstein barycenter of each cluster
# then, for each barycenter, we compute the mean and the std

mean_volatility_plot(lift,labels_wk_means, estimator_wk_means.cluster_centers_, np.array(estimator_wk_means.cluster_centers_).mean(axis=1),
                    np.array(estimator_wk_means.cluster_centers_).std(axis=1))